Let's start importing some basic packages.
from sodapy import Socrata
import json
import pickle
import ast
import os
import sys
import itertools
from random import randrange
import pandas as pd
import numpy as np
import psycopg2
from datetime import datetime, timedelta
import time
import dateutil
The current path of the application and its parent directory are important to move around the different folders of the application.
currentdir = os.getcwd()
parentdir = os.path.abspath(os.path.join(currentdir, os.pardir))
'C:\\Users\\rcmpo\\OneDrive\\Escritorio\\TFM\\application'
Let's add the utils folder to the Python runtime environment using 'sys' library. It allows importing functions within our application without writing them again here in the notebook.
sys.path.insert(0, parentdir + '\\' + 'utils')
from db_utils import (DBConnection,
get_all_link_ids,
get_all_boroughs,
get_coordinates_from_link_id,
get_borough_from_link_id,
get_centroid_from_borough,
get_most_recent_date_stored_for_each_link_id)
from db_utils import (SocrataAPI,
query_flow_to_store_data_in_database,
get_avg_and_quantiles_speed_from_historical_data,
get_real_time_traffic_data)
Our Prophet model needs information about when New York Knicks plays at home in Madison Square Garden. Using 'requests' library and navigating around the NBA official website we can find this URL where all matches of the season were included. We create a list with all the start hour of NYK home matches.
import urllib.request, json
with urllib.request.urlopen('https://es.global.nba.com/stats2/team/schedule.json?countryCode=ES&locale=es&teamCode=knicks') as url:
data = json.loads(url.read().decode())
home_dates = []
for month in range(len(data['payload']['monthGroups'])):
for match in range(len(data['payload']['monthGroups'][month]['games'])):
if data['payload']['monthGroups'][month]['games'][match]['homeTeam']['profile']['abbr'] == 'NYK':
home_dates.append(data['payload']['monthGroups'][month]['games'][match]['profile']['dateTimeEt'])
home_dates = [dateutil.parser.isoparse(x).replace(minute = 0).isoformat() if dateutil.parser.isoparse(x).minute == 30 else dateutil.parser.isoparse(x).isoformat() for x in home_dates] # Check that datetimes follow datetime format.
home_dates
['2021-10-05T19:00:00', '2021-10-13T19:00:00', '2021-10-15T19:00:00', '2021-10-20T19:00:00', '2021-10-24T19:00:00', '2021-10-26T19:00:00', '2021-11-01T19:00:00', '2021-11-07T18:00:00', '2021-11-10T19:00:00', '2021-11-15T19:00:00', '2021-11-17T19:00:00', '2021-11-20T17:00:00', '2021-11-23T19:00:00', '2021-11-26T19:00:00', '2021-12-02T19:00:00', '2021-12-04T13:00:00', '2021-12-12T12:00:00', '2021-12-14T19:00:00', '2021-12-21T19:00:00', '2021-12-23T19:00:00', '2021-12-25T12:00:00', '2022-01-04T19:00:00', '2022-01-06T19:00:00', '2022-01-10T19:00:00', '2022-01-12T19:00:00', '2022-01-17T13:00:00', '2022-01-18T19:00:00', '2022-01-20T19:00:00', '2022-01-23T13:00:00', '2022-01-31T19:00:00', '2022-02-02T19:00:00', '2022-02-14T19:00:00', '2022-02-16T19:00:00', '2022-02-25T19:00:00', '2022-02-27T13:00:00', '2022-03-16T19:00:00', '2022-03-18T19:00:00', '2022-03-20T19:00:00', '2022-03-22T19:00:00', '2022-03-28T19:00:00', '2022-03-30T19:00:00', '2022-04-02T13:00:00', '2022-04-06T19:00:00', '2022-04-10T13:00:00']
And also let's create two functions to add to DataFrame with the purpose of creating new columns on it which works as regressor when the model is trained. Note that, in order to see the impact of a NBA match in the traffic state, it is not enough just getting the match start hour, but also a hour before to its beginning. We will include both hours in the DataFrame using below function.
nba_schedule_url = 'https://es.global.nba.com/stats2/team/schedule.json?countryCode=ES&locale=es&teamCode=knicks'
def game_in_madison_square_garden(date):
with urllib.request.urlopen(nba_schedule_url) as url:
data = json.loads(url.read().decode())
home_dates = []
for month in range(len(data['payload']['monthGroups'])):
for match in range(len(data['payload']['monthGroups'][month]['games'])):
if data['payload']['monthGroups'][month]['games'][match]['homeTeam']['profile']['abbr'] == 'NYK':
home_dates.append(data['payload']['monthGroups'][month]['games'][match]['profile']['dateTimeEt'])
home_dates = [dateutil.parser.isoparse(x).replace(minute = 0).isoformat() if dateutil.parser.isoparse(x).minute == 30 else dateutil.parser.isoparse(x).isoformat() for x in home_dates] # Check that datetimes follow datetime format.
parsed_dates = [dateutil.parser.isoparse(x) for x in home_dates]
previous_hour = [(x - timedelta(hours = 1)).isoformat() for x in parsed_dates]
model_dates = home_dates + previous_hour
if date in [model_dates]:
return 1
else:
return 0
def is_weekend(ds):
date = pd.to_datetime(ds)
if date.weekday() > 4:
return 1
else:
return 0
Let's also define two functions to calculate model error metrics: RMSE and MAE.
def rmse(actual: np.ndarray, predicted: np.ndarray):
length = len(actual)
return np.sqrt((sum((actual - predicted) ** 2)) / length)
def mae(actual: np.ndarray, predicted: np.ndarray):
return np.mean(abs(actual - predicted))
We have defined all the required utils to train the model. Now, let's create the dataset for the first highway.
link_id = 1
query = f"SELECT * from tb_historical_traffic WHERE link_id = {link_id}"
db = DBConnection()
# Return the query to the local database as a DataFrame sorted by date.
df = db.perform_query(query).sort_values(by = 'measurement_date')
# Filter the columns that we need to the training and set the date as index to resample the DataFrame later.
df = df[['measurement_date', 'speed']]
df = df.set_index('measurement_date')
df.index = pd.to_datetime(df.index)
# Filter the outliers using the IQR approach.
q3, q1 = np.quantile(df['speed'], 0.75), np.quantile(df['speed'], 0.25)
iqr = q3 - q1
upper_limit, lower_limit = q3 + 1.5 * iqr, q1 - 1.5 * iqr
df = df[(df['speed'] > lower_limit) & (df['speed'] < upper_limit)]
# Resample the DataFrame by hour using the average of the observations.
df = df.resample('1H').mean()
df = df.reset_index(drop = False)
# Rename the columns to the names that Prophet supports.
df = df.rename(columns = {'measurement_date': 'ds', 'speed': 'y'})
df
Performing connection with database... Connection successful ! Performing query to database... Query finished ! Connection closed.
| ds | y | |
|---|---|---|
| 0 | 2017-05-05 09:00:00 | 8.700000 |
| 1 | 2017-05-05 10:00:00 | 8.390000 |
| 2 | 2017-05-05 11:00:00 | 6.318333 |
| 3 | 2017-05-05 12:00:00 | NaN |
| 4 | 2017-05-05 13:00:00 | 3.110000 |
| ... | ... | ... |
| 41961 | 2022-02-16 18:00:00 | 1.505714 |
| 41962 | 2022-02-16 19:00:00 | 3.257500 |
| 41963 | 2022-02-16 20:00:00 | 2.015000 |
| 41964 | 2022-02-16 21:00:00 | 3.257500 |
| 41965 | 2022-02-16 22:00:00 | 3.100000 |
41966 rows × 2 columns
This is how the DataFrame looks, however, we need to apply also the two functions created before to use the new columns as regressor in the training. Let's do it.
df['weekend'] = df['ds'].apply(is_weekend)
df['knicks_game'] = df['ds'].apply(match_in_madison_square_garden)
df
| ds | y | weekend | knicks_game | |
|---|---|---|---|---|
| 0 | 2017-05-05 09:00:00 | 8.700000 | 0 | 0 |
| 1 | 2017-05-05 10:00:00 | 8.390000 | 0 | 0 |
| 2 | 2017-05-05 11:00:00 | 6.318333 | 0 | 0 |
| 3 | 2017-05-05 12:00:00 | NaN | 0 | 0 |
| 4 | 2017-05-05 13:00:00 | 3.110000 | 0 | 0 |
| ... | ... | ... | ... | ... |
| 41961 | 2022-02-16 18:00:00 | 1.505714 | 0 | 0 |
| 41962 | 2022-02-16 19:00:00 | 3.257500 | 0 | 0 |
| 41963 | 2022-02-16 20:00:00 | 2.015000 | 0 | 0 |
| 41964 | 2022-02-16 21:00:00 | 3.257500 | 0 | 0 |
| 41965 | 2022-02-16 22:00:00 | 3.100000 | 0 | 0 |
41966 rows × 4 columns
Apart from weekends and matches in the Madison Square Garden, Prophet supports information about holidays. Let's create another DataFrame using the 'holidays' Python package.
import holidays
nyc_holidays = pd.DataFrame(holidays.CountryHoliday('US', state = 'NY', years = 2021).items()).rename(columns = {0: 'ds', 1: 'holiday'})
nyc_holidays
| ds | holiday | |
|---|---|---|
| 0 | 2021-01-01 | New Year's Day |
| 1 | 2021-12-31 | New Year's Day (Observed) |
| 2 | 2021-01-18 | Martin Luther King Jr. Day |
| 3 | 2021-02-12 | Lincoln's Birthday |
| 4 | 2021-02-15 | Washington's Birthday, Susan B. Anthony Day |
| 5 | 2021-05-31 | Memorial Day |
| 6 | 2021-06-19 | Juneteenth National Independence Day |
| 7 | 2021-06-18 | Confederate Memorial Day (Observed) |
| 8 | 2021-07-04 | Independence Day |
| 9 | 2021-07-05 | Independence Day (Observed) |
| 10 | 2021-09-06 | Labor Day |
| 11 | 2021-10-11 | Columbus Day |
| 12 | 2021-11-02 | Election Day |
| 13 | 2021-11-11 | Veterans Day |
| 14 | 2021-11-25 | Thanksgiving |
| 15 | 2021-12-25 | Christmas Day |
| 16 | 2021-12-24 | Christmas Day (Observed) |
Cross Validation and Grid Search is performed here below. We have selected 3 different parameters to tune, having 18 different combinations in total. We predict in 6 different periods for each hyper parameter combination using a rolling window cross validation and after that, we take the average of the 6 periods RMSE. The lowest RMSE is the best one and its parameters combination associated to it is selected to test again the model on test data.
from fbprophet import Prophet
from fbprophet.serialize import model_to_json, model_from_json
param_grid = {'changepoint_prior_scale': [0.01, 0.1, 0.5],
'changepoint_range': [0.8, 0.9, 0.95],
'seasonality_mode': ['multiplicative', 'additive']}
# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
grid_search_df = pd.DataFrame()
avg_RMSEs = []
SAVE_MODEL = True
for params in all_params:
print(f'############## {params} ##############')
start_train_date = (datetime.now() - timedelta(days = 365)).replace(hour = 0, minute = 0, second = 0, microsecond = 0)
end_train_date = start_train_date + timedelta(days = 30 * 8)
stop_validation = end_train_date + timedelta(days = 30 * 3)
RMSEs = {}
period_n = 0
while True:
start_val_date = end_train_date
end_val_date = start_val_date + timedelta(days = 14)
if end_val_date > stop_validation:
print(f'\nDate limit ({stop_validation}) to validate data reached!')
end_val_date = start_val_date
break
print(f'Training data between {start_train_date}, and {end_train_date}')
train_data = df[(df['ds'] > start_train_date.strftime('%Y-%m-%d')) & (df['ds'] < end_train_date.strftime('%Y-%m-%d'))]
val_data = df[(df['ds'] > start_val_date.strftime('%Y-%m-%d')) & (df['ds'] < end_val_date.strftime('%Y-%m-%d'))]
m = Prophet(**params,
yearly_seasonality = True,
weekly_seasonality = True,
daily_seasonality = True,
holidays = nyc_holidays)
m.add_regressor('weekend')
m.add_regressor('knicks_game')
m.fit(train_data)
future = m.make_future_dataframe(periods = 360, freq = 'H')
future['weekend'] = future['ds'].apply(is_weekend)
future['knicks_game'] = future['ds'].apply(match_in_madison_square_garden)
forecast = m.predict(future)
forecast = forecast[(forecast['ds'] > start_val_date.isoformat()) & (forecast['ds'] < end_val_date.isoformat())]
metrics_df = val_data.merge(right = forecast, how = 'inner', left_on = 'ds', right_on = 'ds')[['ds', 'y', 'yhat']].dropna()
metrics_df = metrics_df.set_index('ds')
metrics_df = metrics_df.dropna()
actual, predicted = np.array(metrics_df['y'].values), np.array(metrics_df['yhat'].values)
rmse_ = rmse(actual, predicted)
RMSEs[period_n] = rmse_
print(f'Validation metrics between {start_val_date} and {end_val_date} -> RMSE: {rmse_}\n')
start_train_date = start_train_date + timedelta(days = 14)
end_train_date = end_train_date + timedelta(days = 14)
period_n += 1
grid_search_df = grid_search_df.append(RMSEs, ignore_index = True)
avg_rmse, std_rmse = np.round(np.mean([x for x in RMSEs.values()]), 3), np.round(np.std([x for x in RMSEs.values()]), 3)
avg_RMSEs.append(avg_rmse)
print('')
print(f'-> Average RMSE using 14-days Cross Validation: {avg_rmse} ± {std_rmse}')
print('')
print('All Cross Validation finished!')
grid_search_df['Avg. RMSE'] = avg_RMSEs
final_grid_search_df = pd.concat([pd.DataFrame(all_params), grid_search_df], axis = 1)
best_df = final_grid_search_df[final_grid_search_df.index == final_grid_search_df['Avg. RMSE'].idxmin()][[x for x in final_grid_search_df.columns if x not in [0, 1, 2, 3, 4, 5, 'Avg. RMSE']]]
best_params = best_df.to_dict(orient = 'records')[0]
print(f'\n --- Best params: {best_params} ---\n')
# Test data after the latest validation performed !
print(f'Testing data after {end_val_date} with optimal model...')
test_data = df[df['ds'] > end_val_date]
if SAVE_MODEL:
m = Prophet(**best_params,
yearly_seasonality = True,
weekly_seasonality = True,
daily_seasonality = True,
holidays = nyc_holidays)
m.add_regressor('weekend')
m.add_regressor('knicks_game')
m.fit(test_data)
print('Done!')
with open(f'C://Users//rcmpo//OneDrive//Escritorio//TFM//application//models//{link_id}.json', 'w') as fout:
json.dump(model_to_json(m), fout)
else:
with open(f'C://Users//rcmpo//OneDrive//Escritorio//TFM//application//models//{link_id}.json', 'r') as fin:
m = model_from_json(json.load(fin))
future = m.make_future_dataframe(periods = 360, freq = 'H')
future['weekend'] = future['ds'].apply(is_weekend)
future['knicks_game'] = future['ds'].apply(match_in_madison_square_garden)
forecast = m.predict(future)
predictions = test_data.merge(right = forecast[['ds', 'yhat']], how = 'inner', left_on = 'ds', right_on = 'ds')[['ds', 'y', 'yhat']].dropna()
print(f"RMSE for test data: {rmse(predictions['y'].values, predictions['yhat'].values)}")
predictions['mae'] = abs(predictions['y'] - predictions['yhat'])
print(f"MAE for test data: {np.mean(predictions['mae'])}")
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 3.548646658205556
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.613006458084591
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 3.632000382607575
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 8.0269451308222
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 10.388647247407453
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.406160683206686
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 6.603 ± 2.421
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 4.652056423383575
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 5.473010079561313
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.1575973841360785
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 12.604162859000365
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 10.642418533833725
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.301993736749074
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 7.805 ± 2.992
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 3.5461079281709367
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.28283493551903
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 3.997358557614436
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.474140878055749
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 10.089199286577944
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 6.897272828262811
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 6.214 ± 2.144
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 3.8670076830877917
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 5.63497170509369
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.455861516143279
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 12.607277360765417
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.868039423336139
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.399785204990593
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 7.472 ± 2.96
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 3.55927597579024
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 4.320758976566218
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 3.6552906226437396
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 8.65193814350648
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.59272884282056
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 6.825586256132988
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 6.101 ± 2.41
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 5.176063481075989
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 5.246194067116321
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.60541189659985
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 12.746027506393263
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 10.849700439219339
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.064610372436119
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 7.781 ± 2.959
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.438099449051967
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 15.225417004775556
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.461090911306057
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 25.670853467143967
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 10.511810667588907
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 15.661904246516121
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 13.162 ± 6.813
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 23.088028702679495
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 54.26831727551163
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.649864281644529
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 80.48008489074928
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 39.18562086718136
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 25.554544778810417
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 38.538 ± 23.479
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 16.166773733282824
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 12.262770322869365
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 4.208400666189378
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 22.095670830857987
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 5.909290497636558
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 16.419447805969714
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 12.844 ± 6.224
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 23.697288653402477
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 37.18942994202489
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.945632125009102
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 80.36649015106038
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 55.52353299375275
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 33.161773834311475
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 39.981 ± 22.709
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 9.601623318359167
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 14.360614350237798
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.317535200236071
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 17.67919410715513
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 6.44147173839853
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 15.296656862496901
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 11.45 ± 4.622
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 45.622545586315944
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 53.91577969924726
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.255825168069293
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 47.93112279623359
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 18.919766147483966
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 9.602383949918279
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 30.875 ± 18.718
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 8.605729360248946
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 7.247944327539396
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 6.3039927163062055
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 26.199906638649587
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 7.899615446808091
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 15.955709437681783
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 12.035 ± 7.079
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 32.08959856678633
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 4.5904067082829485
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 18.704632000261213
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 55.88247304266387
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.645499836538812
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 13.142762449128305
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 22.343 ± 17.292
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 30.597074777247894
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 5.7715415827584495
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 6.196917717720455
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 20.99420444917006
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 5.337882104958939
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 16.756685997785464
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 14.276 ± 9.444
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 8.504746353897035
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 4.690421641254657
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 40.23232143331802
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 64.84724341570151
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 24.466934146201954
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 25.374508972432185
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 28.019 ± 20.208
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 27.286764449027377
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 25.73645834990501
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.8374698003813545
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 15.03448071920315
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 4.507477424801021
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 20.003246457253233
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 16.401 ± 8.884
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 44.751237948803414
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 36.87242700158772
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.107100784828411
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 66.8029758882345
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 17.419691444545155
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 23.26918471165233
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 33.037 ± 19.166
All Cross Validation finished!
--- Best params: {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ---
Testing data after 2022-01-16 00:00:00 with optimal model...
Done!
RMSE for test data: 3.14451660725604
MAE for test data: 2.1124367023432167
The final results of the model training are collected in the following DataFrame (0, 1, 2, 3, 4 and 5 are the different periods using for cross validation)
final_grid_search_df
| changepoint_prior_scale | changepoint_range | seasonality_mode | 0 | 1 | 2 | 3 | 4 | 5 | Avg. RMSE | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.01 | 0.80 | multiplicative | 3.548647 | 6.613006 | 3.632000 | 8.026945 | 10.388647 | 7.406161 | 6.603 |
| 1 | 0.01 | 0.80 | additive | 4.652056 | 5.473010 | 5.157597 | 12.604163 | 10.642419 | 8.301994 | 7.805 |
| 2 | 0.01 | 0.90 | multiplicative | 3.546108 | 6.282835 | 3.997359 | 6.474141 | 10.089199 | 6.897273 | 6.214 |
| 3 | 0.01 | 0.90 | additive | 3.867008 | 5.634972 | 5.455862 | 12.607277 | 9.868039 | 7.399785 | 7.472 |
| 4 | 0.01 | 0.95 | multiplicative | 3.559276 | 4.320759 | 3.655291 | 8.651938 | 9.592729 | 6.825586 | 6.101 |
| 5 | 0.01 | 0.95 | additive | 5.176063 | 5.246194 | 5.605412 | 12.746028 | 10.849700 | 7.064610 | 7.781 |
| 6 | 0.10 | 0.80 | multiplicative | 6.438099 | 15.225417 | 5.461091 | 25.670853 | 10.511811 | 15.661904 | 13.162 |
| 7 | 0.10 | 0.80 | additive | 23.088029 | 54.268317 | 8.649864 | 80.480085 | 39.185621 | 25.554545 | 38.538 |
| 8 | 0.10 | 0.90 | multiplicative | 16.166774 | 12.262770 | 4.208401 | 22.095671 | 5.909290 | 16.419448 | 12.844 |
| 9 | 0.10 | 0.90 | additive | 23.697289 | 37.189430 | 9.945632 | 80.366490 | 55.523533 | 33.161774 | 39.981 |
| 10 | 0.10 | 0.95 | multiplicative | 9.601623 | 14.360614 | 5.317535 | 17.679194 | 6.441472 | 15.296657 | 11.450 |
| 11 | 0.10 | 0.95 | additive | 45.622546 | 53.915780 | 9.255825 | 47.931123 | 18.919766 | 9.602384 | 30.875 |
| 12 | 0.50 | 0.80 | multiplicative | 8.605729 | 7.247944 | 6.303993 | 26.199907 | 7.899615 | 15.955709 | 12.035 |
| 13 | 0.50 | 0.80 | additive | 32.089599 | 4.590407 | 18.704632 | 55.882473 | 9.645500 | 13.142762 | 22.343 |
| 14 | 0.50 | 0.90 | multiplicative | 30.597075 | 5.771542 | 6.196918 | 20.994204 | 5.337882 | 16.756686 | 14.276 |
| 15 | 0.50 | 0.90 | additive | 8.504746 | 4.690422 | 40.232321 | 64.847243 | 24.466934 | 25.374509 | 28.019 |
| 16 | 0.50 | 0.95 | multiplicative | 27.286764 | 25.736458 | 5.837470 | 15.034481 | 4.507477 | 20.003246 | 16.401 |
| 17 | 0.50 | 0.95 | additive | 44.751238 | 36.872427 | 9.107101 | 66.802976 | 17.419691 | 23.269185 | 33.037 |
Best parameters are:
And with these parameters we get a metrics on test data of:
Let's export the DataFrame in order to use it in the dissertation.
import dataframe_image as dfi
dfi.export(final_grid_search_df, r'C:\Users\rcmpo\OneDrive\Escritorio\TFM\memory_utils\final_grid_search_df.png', max_rows = 20)
Finally, let's compare the results with a seasonality Naïve and a 3-weeks seasonality Moving average model:
naive = pd.concat([val_data, test_data]).reset_index(drop = True)
naive = naive[naive['ds'] > '2022-01-14 00:00:00']
naive = naive.set_index('ds')
naive = naive.sort_index()
naive['yhat'] = naive['y'].shift(periods = 24, freq = 'H')
naive = naive.dropna()
naive['mae'] = abs(naive['y'] - naive['yhat'])
print(f"MAE for test data: {np.mean(naive['mae'])}")
rmse_ = rmse(naive['y'].values, naive['yhat'].values)
print(f"RMSE for test data: {rmse_}")
MAE for test data: 2.3880420414116545 RMSE for test data: 3.759215566389097
def get_previous_dates_given_a_date(date):
dates = [date - timedelta(days = 7), date - timedelta(days = 14), date - timedelta(days = 21)]
return dates
pd.options.display.max_rows = 20
moving_avg = pd.concat([train_data, val_data, test_data]).reset_index(drop = True)
moving_avg = moving_avg.drop(columns = ['weekend', 'knicks_game'])
moving_avg['yhat'] = None
moving_avg = moving_avg.set_index('ds')
moving_avg = moving_avg.sort_index()
moving_avg = moving_avg[moving_avg.index > '2021-12-01']
for idx in moving_avg.index:
previous_dates = get_previous_dates_given_a_date(idx)
speeds = moving_avg[moving_avg.index.isin(previous_dates)]['y'].values
avg = np.mean([x for x in speeds if str(x) != 'nan'])
moving_avg.at[idx, 'yhat'] = avg
moving_avg = moving_avg[moving_avg.index > '2022-01-15 00:00:00']
moving_avg = moving_avg.dropna()
moving_avg['mae'] = abs(moving_avg['y'] - moving_avg['yhat'])
print(f"MAE for test data: {np.mean(moving_avg['mae'])}")
rmse_ = rmse(moving_avg['y'].values, moving_avg['yhat'].values)
print(f"RMSE for test data: {rmse_}")
C:\Users\rcmpo\Anaconda3\envs\tfm_\lib\site-packages\numpy\core\fromnumeric.py:3440: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\rcmpo\Anaconda3\envs\tfm_\lib\site-packages\numpy\core\_methods.py:189: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount)
MAE for test data: 2.2335440682768026 RMSE for test data: 3.578375165987913
final_comparison = pd.DataFrame(index = ['MAE', 'RMSE'])
final_comparison['Prophet'] = [2.112, 3.144]
final_comparison['Naïve'] = [2.388, 3.759]
final_comparison['Moving average'] = [2.233, 3.578]
dfi.export(final_comparison, r'C:\Users\rcmpo\OneDrive\Escritorio\TFM\memory_utils\test_comparison.png')
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x = predictions.ds, y = predictions.y, name = 'Real'))
fig.add_trace(go.Scatter(x = predictions.ds, y = predictions.yhat, name = 'Prophet'))
fig.add_trace(go.Scatter(x = naive.index, y = naive.yhat, name = 'Naïve', marker = dict(color = 'orange')))
fig.add_trace(go.Scatter(x = moving_avg.index, y = moving_avg.yhat, name = 'Moving average', marker = dict(color = 'crimson')))
fig.show()
Now let's repeat the whole process with the second highway.
link_id = 365
query = f"SELECT * from tb_historical_traffic WHERE link_id = {link_id}"
db = DBConnection()
# Return the query to the local database as a DataFrame sorted by date.
df = db.perform_query(query).sort_values(by = 'measurement_date')
# Filter the columns that we need to the training and set the date as index to resample the DataFrame later.
df = df[['measurement_date', 'speed']]
df = df.set_index('measurement_date')
df.index = pd.to_datetime(df.index)
# Filter the outliers using the IQR approach.
q3, q1 = np.quantile(df['speed'], 0.75), np.quantile(df['speed'], 0.25)
iqr = q3 - q1
upper_limit, lower_limit = q3 + 1.5 * iqr, q1 - 1.5 * iqr
df = df[(df['speed'] > lower_limit) & (df['speed'] < upper_limit)]
# Resample the DataFrame by hour using the average of the observations.
df = df.resample('1H').mean()
df = df.reset_index(drop = False)
# Rename the columns to the names that Prophet supports.
df = df.rename(columns = {'measurement_date': 'ds', 'speed': 'y'})
df
Performing connection with database... Connection successful ! Performing query to database... Query finished ! Connection closed.
| ds | y | |
|---|---|---|
| 0 | 2016-12-28 09:00:00 | 14.910000 |
| 1 | 2016-12-28 10:00:00 | NaN |
| 2 | 2016-12-28 11:00:00 | NaN |
| 3 | 2016-12-28 12:00:00 | NaN |
| 4 | 2016-12-28 13:00:00 | NaN |
| ... | ... | ... |
| 45033 | 2022-02-16 18:00:00 | 11.335833 |
| 45034 | 2022-02-16 19:00:00 | 20.344167 |
| 45035 | 2022-02-16 20:00:00 | 25.264167 |
| 45036 | 2022-02-16 21:00:00 | 22.107500 |
| 45037 | 2022-02-16 22:00:00 | 26.405000 |
45038 rows × 2 columns
df['weekend'] = df['ds'].apply(is_weekend)
df['knicks_game'] = df['ds'].apply(match_in_madison_square_garden)
df
| ds | y | weekend | knicks_game | |
|---|---|---|---|---|
| 0 | 2016-12-28 09:00:00 | 14.910000 | 0 | 0 |
| 1 | 2016-12-28 10:00:00 | NaN | 0 | 0 |
| 2 | 2016-12-28 11:00:00 | NaN | 0 | 0 |
| 3 | 2016-12-28 12:00:00 | NaN | 0 | 0 |
| 4 | 2016-12-28 13:00:00 | NaN | 0 | 0 |
| ... | ... | ... | ... | ... |
| 45033 | 2022-02-16 18:00:00 | 11.335833 | 0 | 0 |
| 45034 | 2022-02-16 19:00:00 | 20.344167 | 0 | 0 |
| 45035 | 2022-02-16 20:00:00 | 25.264167 | 0 | 0 |
| 45036 | 2022-02-16 21:00:00 | 22.107500 | 0 | 0 |
| 45037 | 2022-02-16 22:00:00 | 26.405000 | 0 | 0 |
45038 rows × 4 columns
from fbprophet import Prophet
from fbprophet.serialize import model_to_json, model_from_json
param_grid = {'changepoint_prior_scale': [0.01, 0.1, 0.5],
'changepoint_range': [0.8, 0.9, 0.95],
'seasonality_mode': ['multiplicative', 'additive']}
# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
grid_search_df = pd.DataFrame()
avg_RMSEs = []
SAVE_MODEL = True
for params in all_params:
print(f'############## {params} ##############')
start_train_date = (datetime.now() - timedelta(days = 365)).replace(hour = 0, minute = 0, second = 0, microsecond = 0)
end_train_date = start_train_date + timedelta(days = 30 * 8)
stop_validation = end_train_date + timedelta(days = 30 * 3)
RMSEs = {}
period_n = 0
while True:
start_val_date = end_train_date
end_val_date = start_val_date + timedelta(days = 14)
if end_val_date > stop_validation:
print(f'\nDate limit ({stop_validation}) to validate data reached!')
end_val_date = start_val_date
break
print(f'Training data between {start_train_date}, and {end_train_date}')
train_data = df[(df['ds'] > start_train_date.strftime('%Y-%m-%d')) & (df['ds'] < end_train_date.strftime('%Y-%m-%d'))]
val_data = df[(df['ds'] > start_val_date.strftime('%Y-%m-%d')) & (df['ds'] < end_val_date.strftime('%Y-%m-%d'))]
m = Prophet(**params,
yearly_seasonality = True,
weekly_seasonality = True,
daily_seasonality = True,
holidays = nyc_holidays)
m.add_regressor('weekend')
m.add_regressor('knicks_game')
m.fit(train_data)
future = m.make_future_dataframe(periods = 360, freq = 'H')
future['weekend'] = future['ds'].apply(is_weekend)
future['knicks_game'] = future['ds'].apply(match_in_madison_square_garden)
forecast = m.predict(future)
forecast = forecast[(forecast['ds'] > start_val_date.isoformat()) & (forecast['ds'] < end_val_date.isoformat())]
metrics_df = val_data.merge(right = forecast, how = 'inner', left_on = 'ds', right_on = 'ds')[['ds', 'y', 'yhat']].dropna()
metrics_df = metrics_df.set_index('ds')
metrics_df = metrics_df.dropna()
actual, predicted = np.array(metrics_df['y'].values), np.array(metrics_df['yhat'].values)
rmse_ = rmse(actual, predicted)
RMSEs[period_n] = rmse_
print(f'Validation metrics between {start_val_date} and {end_val_date} -> RMSE: {rmse_}\n')
start_train_date = start_train_date + timedelta(days = 14)
end_train_date = end_train_date + timedelta(days = 14)
period_n += 1
grid_search_df = grid_search_df.append(RMSEs, ignore_index = True)
avg_rmse, std_rmse = np.round(np.mean([x for x in RMSEs.values()]), 3), np.round(np.std([x for x in RMSEs.values()]), 3)
avg_RMSEs.append(avg_rmse)
print('')
print(f'-> Average RMSE using 14-days Cross Validation: {avg_rmse} ± {std_rmse}')
print('')
print('All Cross Validation finished!')
grid_search_df['Avg. RMSE'] = avg_RMSEs
final_grid_search_df = pd.concat([pd.DataFrame(all_params), grid_search_df], axis = 1)
best_df = final_grid_search_df[final_grid_search_df.index == final_grid_search_df['Avg. RMSE'].idxmin()][[x for x in final_grid_search_df.columns if x not in [0, 1, 2, 3, 4, 5, 'Avg. RMSE']]]
best_params = best_df.to_dict(orient = 'records')[0]
print(f'\n --- Best params: {best_params} ---\n')
# Test data after the latest validation performed !
print(f'Testing data after {end_val_date} with optimal model...')
test_data = df[df['ds'] > end_val_date]
if SAVE_MODEL:
m = Prophet(**best_params,
yearly_seasonality = True,
weekly_seasonality = True,
daily_seasonality = True,
holidays = nyc_holidays)
m.add_regressor('weekend')
m.add_regressor('knicks_game')
m.fit(test_data)
print('Done!')
with open(f'C://Users//rcmpo//OneDrive//Escritorio//TFM//application//models//{link_id}.json', 'w') as fout:
json.dump(model_to_json(m), fout)
else:
with open(f'C://Users//rcmpo//OneDrive//Escritorio//TFM//application//models//{link_id}.json', 'r') as fin:
m = model_from_json(json.load(fin))
future = m.make_future_dataframe(periods = 360, freq = 'H')
future['weekend'] = future['ds'].apply(is_weekend)
future['knicks_game'] = future['ds'].apply(match_in_madison_square_garden)
forecast = m.predict(future)
predictions = test_data.merge(right = forecast[['ds', 'yhat']], how = 'inner', left_on = 'ds', right_on = 'ds')[['ds', 'y', 'yhat']].dropna()
print(f"RMSE for test data: {rmse(predictions['y'].values, predictions['yhat'].values)}")
predictions['mae'] = abs(predictions['y'] - predictions['yhat'])
print(f"MAE for test data: {np.mean(predictions['mae'])}")
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.661891882991734
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.9352011806155796
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.140181762028284
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.687998223523513
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.446084037043216
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.076470178501161
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 7.491 ± 1.213
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.891270214966242
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.459753379856594
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.930001107405019
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.429871017177495
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 7.936014849005898
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.449459191279394
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 7.349 ± 0.885
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.637427200394125
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.950098103892979
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.282869936233519
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.581853058915747
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.504252694932129
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.992698501993131
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 7.492 ± 1.265
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.8845543094736845
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.475015622215044
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.792170371372002
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.271316471562356
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 7.880442851262315
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.430348066788547
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 7.289 ± 0.865
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.621319247315348
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.779394265598186
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.002164000190573
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.604490587257373
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.60810127579831
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.962599177699075
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 7.43 ± 1.276
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.865071352204061
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.445237958139613
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.66352111488622
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.299018617372367
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 7.959561791137905
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.618813063420301
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 7.309 ± 0.848
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.989328841041767
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 7.190773659776363
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.714018630427699
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.783276451614362
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.93528301143299
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.628398329724636
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 10.04 ± 5.854
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.807099194154355
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 7.025946895722976
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.51758474698422
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.6143543282507515
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 20.200223115251287
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.926058216556061
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 9.849 ± 4.757
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.964467961913226
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 7.214718572366223
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.67644195426523
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.689073892909414
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.90464600021188
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 11.157711578812698
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 10.435 ± 5.832
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.807138482595291
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 7.049733589565095
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.593984454244891
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.647980459650669
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 17.277499095806817
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.889886178457935
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 9.378 ± 3.701
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.943724491538316
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 9.146758103306716
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.673010714004679
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.75360633367981
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 23.78231270475493
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 13.48918456766125
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 11.298 ± 6.081
############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.808144009721109
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.995379546738166
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.536900364427702
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.610146363907281
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 20.84598350594019
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.656543244447256
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 9.909 ± 5.005
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 7.663500325933629
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 21.443038300285114
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 12.821141731667423
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 21.287214232697533
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.16215545585089
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 13.576774706801904
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 16.492 ± 5.471
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.895868414646396
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 22.208226184036253
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.562404700079158
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 46.245246420636946
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.339782599449375
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 11.11897506673142
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 19.728 ± 13.281
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 10.317548914600524
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 20.77383418041363
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 11.719932983955722
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 20.608933445969893
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 21.16414388188165
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 14.515269914681356
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 16.517 ± 4.508
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.87653301645642
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 27.08741334705353
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.51255761562299
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 28.471372300048365
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 23.85185625661355
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 11.287479842878222
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 17.848 ± 8.824
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 7.1987690439216845
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 23.396497293317683
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 12.030027062259514
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 16.5733663737021
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.41944673865343
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 14.628800480129934
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 16.041 ± 5.647
############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.876698617622195
Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 30.21281677372825
Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.576718323577156
Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 37.76287014793415
Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.135387354270698
Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 11.072309959353191
Date limit (2022-01-22 00:00:00) to validate data reached!
-> Average RMSE using 14-days Cross Validation: 19.606 ± 11.431
All Cross Validation finished!
--- Best params: {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ---
Testing data after 2022-01-16 00:00:00 with optimal model...
Done!
RMSE for test data: 6.504893607080428
MAE for test data: 4.930203972191617
And for this second link road these are the results:
final_grid_search_df
| changepoint_prior_scale | changepoint_range | seasonality_mode | 0 | 1 | 2 | 3 | 4 | 5 | Avg. RMSE | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.01 | 0.80 | multiplicative | 6.661892 | 6.935201 | 8.140182 | 5.687998 | 9.446084 | 8.076470 | 7.491 |
| 1 | 0.01 | 0.80 | additive | 6.891270 | 6.459753 | 8.930001 | 6.429871 | 7.936015 | 7.449459 | 7.349 |
| 2 | 0.01 | 0.90 | multiplicative | 6.637427 | 6.950098 | 8.282870 | 5.581853 | 9.504253 | 7.992699 | 7.492 |
| 3 | 0.01 | 0.90 | additive | 6.884554 | 6.475016 | 8.792170 | 6.271316 | 7.880443 | 7.430348 | 7.289 |
| 4 | 0.01 | 0.95 | multiplicative | 6.621319 | 6.779394 | 8.002164 | 5.604491 | 9.608101 | 7.962599 | 7.430 |
| 5 | 0.01 | 0.95 | additive | 6.865071 | 6.445238 | 8.663521 | 6.299019 | 7.959562 | 7.618813 | 7.309 |
| 6 | 0.10 | 0.80 | multiplicative | 6.989329 | 7.190774 | 8.714019 | 5.783276 | 22.935283 | 8.628398 | 10.040 |
| 7 | 0.10 | 0.80 | additive | 6.807099 | 7.025947 | 9.517585 | 6.614354 | 20.200223 | 8.926058 | 9.849 |
| 8 | 0.10 | 0.90 | multiplicative | 6.964468 | 7.214719 | 8.676442 | 5.689074 | 22.904646 | 11.157712 | 10.435 |
| 9 | 0.10 | 0.90 | additive | 6.807138 | 7.049734 | 9.593984 | 6.647980 | 17.277499 | 8.889886 | 9.378 |
| 10 | 0.10 | 0.95 | multiplicative | 6.943724 | 9.146758 | 8.673011 | 5.753606 | 23.782313 | 13.489185 | 11.298 |
| 11 | 0.10 | 0.95 | additive | 6.808144 | 6.995380 | 9.536900 | 6.610146 | 20.845984 | 8.656543 | 9.909 |
| 12 | 0.50 | 0.80 | multiplicative | 7.663500 | 21.443038 | 12.821142 | 21.287214 | 22.162155 | 13.576775 | 16.492 |
| 13 | 0.50 | 0.80 | additive | 6.895868 | 22.208226 | 9.562405 | 46.245246 | 22.339783 | 11.118975 | 19.728 |
| 14 | 0.50 | 0.90 | multiplicative | 10.317549 | 20.773834 | 11.719933 | 20.608933 | 21.164144 | 14.515270 | 16.517 |
| 15 | 0.50 | 0.90 | additive | 6.876533 | 27.087413 | 9.512558 | 28.471372 | 23.851856 | 11.287480 | 17.848 |
| 16 | 0.50 | 0.95 | multiplicative | 7.198769 | 23.396497 | 12.030027 | 16.573366 | 22.419447 | 14.628800 | 16.041 |
| 17 | 0.50 | 0.95 | additive | 6.876699 | 30.212817 | 9.576718 | 37.762870 | 22.135387 | 11.072310 | 19.606 |
Note that in this case results are worse than before.
Best parameters are:
And with these parameters we get a metrics on test data of:
Let's export again this second DataFrame:
import dataframe_image as dfi
dfi.export(final_grid_search_df, r'C:\Users\rcmpo\OneDrive\Escritorio\TFM\memory_utils\final_grid_search_df_2.png', max_rows = 20)
Let's compare again the results with the other 2 models:
naive = pd.concat([val_data, test_data]).reset_index(drop = True)
naive = naive[naive['ds'] > '2022-01-14 00:00:00']
naive = naive.set_index('ds')
naive = naive.sort_index()
naive['yhat'] = naive['y'].shift(periods = 24, freq = 'H')
naive = naive.dropna()
naive['mae'] = abs(naive['y'] - naive['yhat'])
print(f"MAE for test data: {np.mean(naive['mae'])}")
rmse_ = rmse(naive['y'].values, naive['yhat'].values)
print(f"RMSE for test data: {rmse_}")
MAE for test data: 6.1816020670806155 RMSE for test data: 8.27205117029201
moving_avg = pd.concat([train_data, val_data, test_data]).reset_index(drop = True)
moving_avg = moving_avg.drop(columns = ['weekend', 'knicks_game'])
moving_avg['yhat'] = None
moving_avg = moving_avg.set_index('ds')
moving_avg = moving_avg.sort_index()
moving_avg = moving_avg[moving_avg.index > '2021-12-01']
for idx in moving_avg.index:
previous_dates = get_previous_dates_given_a_date(idx)
speeds = moving_avg[moving_avg.index.isin(previous_dates)]['y'].values
avg = np.mean([x for x in speeds if str(x) != 'nan'])
moving_avg.at[idx, 'yhat'] = avg
moving_avg = moving_avg[moving_avg.index > '2022-01-15 00:00:00']
moving_avg = moving_avg.dropna()
moving_avg['mae'] = abs(moving_avg['y'] - moving_avg['yhat'])
print(f"MAE for test data: {np.mean(moving_avg['mae'])}")
rmse_ = rmse(moving_avg['y'].values, moving_avg['yhat'].values)
print(f"RMSE for test data: {rmse_}")
C:\Users\rcmpo\Anaconda3\envs\tfm_\lib\site-packages\numpy\core\fromnumeric.py:3440: RuntimeWarning: Mean of empty slice. C:\Users\rcmpo\Anaconda3\envs\tfm_\lib\site-packages\numpy\core\_methods.py:189: RuntimeWarning: invalid value encountered in double_scalars
MAE for test data: 5.743760804026095 RMSE for test data: 7.870923289096443
And let's see the final comparison. Prophet is again the best alternative.
final_comparison = pd.DataFrame(index = ['MAE', 'RMSE'])
final_comparison['Prophet'] = [4.930, 6.505]
final_comparison['Naïve'] = [6.182, 8.272]
final_comparison['Moving average'] = [5.744, 7.871]
dfi.export(final_comparison, r'C:\Users\rcmpo\OneDrive\Escritorio\TFM\memory_utils\test_comparison_2.png')
final_comparison
| Prophet | Naïve | Moving average | |
|---|---|---|---|
| MAE | 4.930 | 6.182 | 5.744 |
| RMSE | 6.505 | 8.272 | 7.871 |
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x = predictions.ds, y = predictions.y, name = 'Real'))
fig.add_trace(go.Scatter(x = predictions.ds, y = predictions.yhat, name = 'Prophet'))
fig.add_trace(go.Scatter(x = naive.index, y = naive.yhat, name = 'Naïve', marker = dict(color = 'orange')))
fig.add_trace(go.Scatter(x = moving_avg.index, y = moving_avg.yhat, name = 'Moving average', marker = dict(color = 'crimson')))
fig.show()